source("config.R")
## Loading required package: pacman
installs and loads necessary packages
Let’s start with two data sets that have home price information by Zip Code from Zillow as well as a little U.S. Census Bureau data. More specifically, there’s median single-family home value in one and median price per square foot in the other, by Zip Code in 4 eastern Massachusetts counties: Suffolk (mostly Boston), Middlesex (just to the west), Essex (just to the north), and Norfolk (Brookline & others)
I’ll read them in with the readr package, which is good at keeping zip codes as character strings and not numbers (as numbers, here in New England the initial 0 gets stripped out)
price_median <- readr::read_csv("data/zillow_data_median_sf_price.csv")
## Parsed with column specification:
## cols(
## ZipCode = col_character(),
## City = col_character(),
## County = col_character(),
## TotalPopulation = col_double(),
## MedianSFPrice = col_double(),
## PctNonLatinxWhite = col_double()
## )
price_sqft <- readr::read_csv("data/zillow_data_price_per_sq_ft.csv")
## Parsed with column specification:
## cols(
## ZipCode = col_character(),
## City = col_character(),
## County = col_character(),
## TotalPopulation = col_double(),
## PricePerSqFt = col_double(),
## PctNonLatinxWhite = col_double()
## )
Before doing any analysis, it’s always good to take a few basic steps to get familiar with the data set. Let’s look at the price_median data set.
head(price_median)
## # A tibble: 6 x 6
## ZipCode City County TotalPopulation MedianSFPrice PctNonLatinxWhi~
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 01431 Ashby Middlesex 3195 271400 94.9
## 2 01432 Ayer Middlesex 7703 354700 80
## 3 01450 Groton Middlesex 11282 497500 92.5
## 4 01460 Littlet~ Middlesex 9754 479900 89
## 5 01463 Peppere~ Middlesex 12049 359400 91.8
## 6 01464 Shirley Middlesex 7572 347100 68.4
tail(price_median) # make sure there's no total row at the end to screw things up
## # A tibble: 6 x 6
## ZipCode City County TotalPopulation MedianSFPrice PctNonLatinxWhi~
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 02481 Wellesley Norfolk 16309 1374500 80.3
## 2 02482 Wellesley Norfolk 11015 1190300 78.6
## 3 02492 Needham Norfolk 20128 968400 87.6
## 4 02493 Weston Middles~ 12027 1536800 79.7
## 5 02494 Needham Norfolk 10301 871300 78.1
## 6 02762 Plainvil~ Norfolk 9000 387000 95.3
Check out the structure
str(price_median)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 173 obs. of 6 variables:
## $ ZipCode : chr "01431" "01432" "01450" "01460" ...
## $ City : chr "Ashby" "Ayer" "Groton" "Littleton" ...
## $ County : chr "Middlesex" "Middlesex" "Middlesex" "Middlesex" ...
## $ TotalPopulation : num 3195 7703 11282 9754 12049 ...
## $ MedianSFPrice : num 271400 354700 497500 479900 359400 ...
## $ PctNonLatinxWhite: num 94.9 80 92.5 89 91.8 68.4 93.8 93.2 79.4 69.7 ...
## - attr(*, "spec")=
## .. cols(
## .. ZipCode = col_character(),
## .. City = col_character(),
## .. County = col_character(),
## .. TotalPopulation = col_double(),
## .. MedianSFPrice = col_double(),
## .. PctNonLatinxWhite = col_double()
## .. )
OR
dplyr::glimpse(price_median)
## Observations: 173
## Variables: 6
## $ ZipCode <chr> "01431", "01432", "01450", "01460", "01463",...
## $ City <chr> "Ashby", "Ayer", "Groton", "Littleton", "Pep...
## $ County <chr> "Middlesex", "Middlesex", "Middlesex", "Midd...
## $ TotalPopulation <dbl> 3195, 7703, 11282, 9754, 12049, 7572, 7454, ...
## $ MedianSFPrice <dbl> 271400, 354700, 497500, 479900, 359400, 3471...
## $ PctNonLatinxWhite <dbl> 94.9, 80.0, 92.5, 89.0, 91.8, 68.4, 93.8, 93...
summary(price_median)
## ZipCode City County TotalPopulation
## Length:173 Length:173 Length:173 Min. : 1964
## Class :character Class :character Class :character 1st Qu.:10676
## Mode :character Mode :character Mode :character Median :18697
## Mean :21118
## 3rd Qu.:28505
## Max. :61375
## MedianSFPrice PctNonLatinxWhite
## Min. : 271400 Min. : 2.60
## 1st Qu.: 445800 1st Qu.:68.40
## Median : 567400 Median :79.70
## Mean : 664831 Mean :74.45
## 3rd Qu.: 739900 3rd Qu.:89.40
## Max. :2200600 Max. :97.70
Some more substantial summaries
skimr::skim(price_median)
Hmisc::describe(price_median)
## price_median
##
## 6 Variables 173 Observations
## ---------------------------------------------------------------------------
## ZipCode
## n missing distinct
## 173 0 173
##
## lowest : 01431 01432 01450 01460 01463, highest: 02482 02492 02493 02494 02762
## ---------------------------------------------------------------------------
## City
## n missing distinct
## 173 0 116
##
## lowest : Acton Amesbury Andover Arlington Ashby
## highest: Wilmington Winchester Winthrop Woburn Wrentham
## ---------------------------------------------------------------------------
## County
## n missing distinct
## 173 0 4
##
## Value Essex Middlesex Norfolk Suffolk
## Frequency 38 74 38 23
## Proportion 0.220 0.428 0.220 0.133
## ---------------------------------------------------------------------------
## TotalPopulation
## n missing distinct Info Mean Gmd .05 .10
## 173 0 173 1 21118 14622 4367 5984
## .25 .50 .75 .90 .95
## 10676 18697 28505 38944 47280
##
## lowest : 1964 2270 2812 3155 3195, highest: 53873 55074 56541 59779 61375
## ---------------------------------------------------------------------------
## MedianSFPrice
## n missing distinct Info Mean Gmd .05 .10
## 173 0 171 1 664831 347931 335340 358840
## .25 .50 .75 .90 .95
## 445800 567400 739900 1144500 1365740
##
## lowest : 271400 279100 283500 293600 298200
## highest: 1788600 1919400 1987800 2122600 2200600
## ---------------------------------------------------------------------------
## PctNonLatinxWhite
## n missing distinct Info Mean Gmd .05 .10
## 173 0 154 1 74.45 21.2 29.62 46.18
## .25 .50 .75 .90 .95
## 68.40 79.70 89.40 92.96 95.34
##
## lowest : 2.6 4.2 10.3 12.0 16.4, highest: 96.6 96.9 97.2 97.4 97.7
## ---------------------------------------------------------------------------
Visualizations make it much easier to see things like distributions and outliers.
Don’t know what type of visualization to use for exploring the data you have? https://www.data-to-viz.com gives you suggestions based on the type of data, plus code and advice. Super useful.
Let’s first look at just current median price. One numerical. We can do a histogram.
I’m going to show you both base R built-in graphics and the most popular R package for visualization, widely used among journalists and others, called ggplot2. It’s mostly a matter of personal preference which you want to use. I use both, sometimes base R for ‘quick and dirty’ and ggplot when I want to do more customization.
Quick base R histogram – hist() function. It will try to guess good breaks for your bars.
hist(price_median$MedianSFPrice)
With headline and better x axis label. main for headline, xlab for x axis
hist(price_median$MedianSFPrice , main = "Zillow median home prices, March 2019", xlab = "Price")
ggplot2 is a little more involved.
The first line only sets up what the data frame is, what the x and y values should be, and if you want to group by a category or color-fill by a category. The first line doesn’t actually display anything.
The second line says what kind of viz you want. Syntax is
ggplot(mydata, aes(x = xcolumn, y = ycolumn)) +
geom_viztype()
In this case we only have one value so don’t need to specify x and y
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot2’s histogram defaults to 30 bins but scolds you to pick something with better intention. Although that looks decent to me, we can try out some others.
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000)
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 100000)
There is a LOT more customizing you can do. Here I’ll make the bars blue outlined in white.
# Change the bar fill color and outline color
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")
I’ll also add a headline and change the default X axis label with ggtitle and xlab.
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000, fill = "darkblue", color = "white") +
ggtitle("Zillow median home prices, March 2019") +
xlab("Price")
Add a headline and x-axis label, change the theme
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
ggtitle("Zillow median home prices, March 2019") +
xlab("Price") +
theme_light()
Add dollar signs and commas to the axis
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
ggtitle("Zillow median home prices, March 2019") +
xlab("Price") +
theme_light() +
scale_x_continuous(labels = dollar)
You can play with lots of themes and the ggthemes package :-)
ggplot(price_median, aes(MedianSFPrice)) +
geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
labs(title = "Median Home Prices, March 2019",
subtitle = "Source: Zillow Mass. data from Suffolk, Middlesex, Essex, and Norfolk counties "
) +
xlab("Price") +
scale_x_continuous(labels = dollar) +
ggthemes::theme_economist()
But back to our data. Let’s take another look at distribution, this time with a box plot
Base R
boxplot(price_median$MedianSFPrice)
Lots of expensive outliers
ggplot2 - here I use color suggestions from Data to Viz’s R graph gallery.
Since I don’t have an x column but boxplot looks for one, I’m using an empty character string for that and the label
ggplot(price_median, aes(x = "", y = MedianSFPrice)) +
geom_boxplot(fill="slateblue", alpha=0.2) +
xlab("")
Again I add dollar sign and commas to make y axis easier to read
ggplot(price_median, aes(x = "", y = MedianSFPrice)) +
geom_boxplot(fill="slateblue", alpha=0.2) +
xlab("") +
scale_y_continuous(labels = dollar)
A bit easier to see those expensive outliers.
Where ggplot also comes in handy is if we’d like to create multiple boxplots by a category, such as one box plot per county. All we need to do as put county on the X axis!
ggplot(price_median, aes(x = County, y = MedianSFPrice)) +
geom_boxplot(fill="slateblue", alpha=0.2)
That’s a bit of a different look.
What about Boston, Cambridge, and Somerville? Let’s filter our data set for just those 2 cities:
bos_cam_som <- filter(price_median, City %in% c("Boston", "Cambridge", "Somerville"))
ggplot(bos_cam_som, aes(x = City, y = MedianSFPrice)) +
geom_boxplot(fill="slateblue", alpha=0.2)
Well that’s interesting.
Now let’s take a look at median price per square foot in Boston, Cambridge, and Somerville.
bos_cam_som_per_sq_ft <- filter(price_sqft, City %in% c("Boston", "Cambridge", "Somerville"))
ggplot(bos_cam_som_per_sq_ft, aes(x = City, y = PricePerSqFt)) +
geom_boxplot(fill="slateblue", alpha=0.2)
That is incredibly different. Remember that this is looking at ONE MEDIAN PER ZIP CODE, not individual prices, so it’s looking at DIFFERENCES IN NEIGHBORHOOD MEDIANS. Looks like there may be a much greater difference in median home sizes by Zip Code in Boston. Also look at those two extremely pricey Zip Code outliers in Boston.
Median price vs percent white?
base R
plot(price_median$PctNonLatinxWhite, price_median$MedianSFPrice)
Could there be relationship? It’s possible to make this base R look better and add a regression line, but I tend to do that work in ggplot.
Remember, first line of code sets up the data set, x values and y values. Second line says what type of “geom” viz type you want.
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice)) +
geom_point()
If you’d like to see which dots are what counties, you can color by county:
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
geom_point()
Things are a little hard to see with the grey background, I’ll change the theme to get rid of the grid lines altogether and bump up the dot size a bit.
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
geom_point(size = 2) +
theme_few() # named for Stephen Few, well known in the data viz field
It would be cool to be able to mouse over the dots and see underlying data, including the city. The plotly package turns a ggplot static plot into a JavaScript one! Just wrap your code in the ggplotly function.
plotly::ggplotly(
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
geom_point(size = 2) +
theme_few() # named for Stephen Few, well known in the data viz field
)
I can’t tell you why, but the syntax for adding fields to the tooltip is something like
text = paste("mycolumn:", mycolumn)
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
geom_point(size = 2) +
theme_few() # named for Stephen Few, well known in the data viz field
plotly::ggplotly(
ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
geom_point(size = 2) +
theme_few() # named for Stephen Few, well known in the data viz field
)
Double click on the legend to just look at one county
If you want to learn more about plotly and R, check out:
https://plot.ly/r/ https://plotly-r.com/overview.html
Note that you can store a graph in a variable, just like any other value
mygraph <- ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
geom_point(size = 2) +
theme_few() # named for Stephen Few, well known in the data viz field
mygraph
plotly::ggplotly(
mygraph
)
Note also you can just use ggplotly() if plotly is loaded in your working session.
I added the plotly:: so you know which package that function comes from.
To add a regression line, lm for linear model. We want to look at all points together, not by county, so back to
mygraph <- ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice)) +
geom_point() +
theme_minimal()
mygraph
Add linear regression line:
mygraph +
geom_smooth(method = lm)
This includes a 95% confidence interval. Not much of a relationship, even when considering the confidence interval. To take out the shaded area, se = FALSE.
mygraph +
geom_smooth(method = lm, se = FALSE)
Let R pick with a method of “auto”:
mygraph +
geom_smooth(method = "auto", se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Now let’s look just at Boston
boston <- filter(price_median, City == "Boston")
ggplot(boston, aes(PctNonLatinxWhite, MedianSFPrice)) +
geom_point(size = 2) +
theme_few() +
geom_smooth(method = lm)
This looks like perhaps more of a relationship Which makes sense, since Boston is generally more expensive than outer suburbs, and many outer suburbs are heavily white. Considering just Boston takes out one important pricing factor: Distance from Boston.
You can see the statistics behind the correlation with cor.test()
cor.test(boston$PctNonLatinxWhite, boston$MedianSFPrice)
##
## Pearson's product-moment correlation
##
## data: boston$PctNonLatinxWhite and boston$MedianSFPrice
## t = 2.2073, df = 18, p-value = 0.04051
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02390675 0.75073338
## sample estimates:
## cor
## 0.4615447
A useful add-on R package for correlations is corr.
It’s still not incredibly strong. One reason could be that Zip Codes don’t necessarily correlate with neighborhoods. However, another reason may be that those two high-priced outliers are affecting the trend. What does it look like for areas where the median price is under a million dollars?
boston2 <- filter(boston, MedianSFPrice < 1000000)
ggplot(boston2, aes(PctNonLatinxWhite, MedianSFPrice)) +
geom_point(size = 2) +
theme_few() +
geom_smooth(method = lm)
That’s pretty dramatic! By the way, you can get rid of the scientific notation with
options(scipen = 999)
I’ll also put dollar signs on the y axis
ggplot(boston2, aes(PctNonLatinxWhite, MedianSFPrice)) +
geom_point(size = 2) +
theme_few() +
scale_y_continuous(labels = dollar) +
geom_smooth(method = lm)
Now look at the correlation:
cor.test(boston2$PctNonLatinxWhite, boston2$MedianSFPrice)
##
## Pearson's product-moment correlation
##
## data: boston2$PctNonLatinxWhite and boston2$MedianSFPrice
## t = 5.7302, df = 16, p-value = 0.00003099
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5721415 0.9305963
## sample estimates:
## cor
## 0.8199816
There’s much more to regression than this, but we don’t have time to go into it. If you’re interested, you can see a nice tutorial by Kaggle data scientist Rachael Tatmen at https://www.kaggle.com/rtatman/regression-challenge-day-1.
Our last viz will be a bar plot. Let’s look at median prices for Boston, Cambridge, Somerville, Brookline, and Everett. We’ll read in a new data set:
cities <- readr::read_csv("data/zillow_median_sfhome_by_city.csv") %>%
filter(City %in% c("Boston", "Cambridge", "Somerville", "Brookline", "Everett"))
## Parsed with column specification:
## cols(
## City = col_character(),
## Metro = col_character(),
## County = col_character(),
## SizeRank = col_double(),
## Price = col_double()
## )
Here’s the base R version, although I tend not to use this for bar charts
barplot(names = cities$City, height = cities$Price)
ggplot makes some nice bar plots, although it can be difficult for me to remember my specific customizations. So, I’ve saved my favorite look as an Rstudio “code snippet”
If we have time, I’ll show you a ggplot code snippet.
First, go to Tools > Global Options > Code. At the bottom, make sure “Enable code snippets” is checked.
Next, run
usethis::edit_rstudio_snippets()
In the file that’s open, add the following somewhere. Make sure the 2nd and third lines start off with a tab indenting them.
snippet my_barplot
ggplot(${1:mydataframe}, aes(x=${2:myxcolname}, y=${3:myycolname})) +
geom_bar(stat="identity", color = "black", fill="#0072B2")
Save and close the r.snippets file.
Now start typing my_barplot somewhere in your upper left RStudio pane and when you see the my_barplot dropdown, hit the tab key and see what happens. Fill in the name of mydataframe, then hit tab and fill in the name of the x column, and again fill in the y column.
ggplot(cities, aes(x=City, y=Price)) +
geom_col(color = "black", fill="#0072B2")
It can be helpful for the viewer to re-order the bars from large to small.
ggplot(cities, aes(x=reorder(City, -Price), y=Price)) +
geom_col(color = "black", fill="#0072B2")
See how to do RStudio code snippets in one of my Do More With R video screencasts: https://youtu.be/h_i__VTSurU.
A couple of other general R resources that might be useful:
Searchable table of almost 200 tasks and how to do them in R, including ggplot # https://smach.github.io/R4JournalismBook/HowDoI.html
Of course, my book https://bit.ly/R4Journalism. Use discount code SCI19 to get 20% off